SE object created from exploratory analysis.
Gene selection according to Biotype already performed
SE_Bio <- readRDS(paste0(params$SE_Bio))Duplicated gene names are dropped and gene IDs are set as rownames.
The number of duplicated GeneName is: 11899
The number of duplicated Ensembl Genes with version is: 0
SE_Bio <- SE_Bio[!duplicated(rowData(SE_Bio)$GeneName), ]
rownames(SE_Bio) <- rowData(SE_Bio)$GeneName
SE_Bio
## class: SummarizedExperiment
## dim: 24708 36
## metadata(0):
## assays(1): counts
## rownames(24708): TSPAN6 TNMD ... LNCDAT CDR1
## rowData names(9): Gene EnsGene ... Start End
## colnames(36): OLE_001 OLE_002 ... OLE_048 OLE_049
## colData names(11): SampleNumber SampleID_GU ... HRID lib.sizeScaledCols <- c('darkblue', "purple","white","lightgoldenrod1", 'goldenrod1')
padj_threshold = params$padj_threshold
log2fc_threshold = params$log2fc_thresholdOnly day50 Cortical Brain Organoids are kept
SE_Bio_d50CbO <- SE_Bio[, colData(SE_Bio)$Timepoint %in% 'd50']
colData(SE_Bio_d50CbO)
## DataFrame with 14 rows and 11 columns
## SampleNumber SampleID_GU SampleID_OG SampleName SampleType Genotype
## <integer> <character> <character> <character> <character> <character>
## OLE_010 7 D50_3a_WT D50_3a_WT ORG_CHD2_WT_round3_d.. CorticalOrganoids WT
## OLE_011 8 D50_3a_HT D50_3a_HT ORG_CHD2_HT_round3_d.. CorticalOrganoids HT
## OLE_016 11 D50_3_WT D50_3_WT ORG_CHD2_WT_round3_d.. CorticalOrganoids WT
## OLE_017 12 D50_3_HT D50_3_HT ORG_CHD2_HT_round3_d.. CorticalOrganoids HT
## OLE_040 27 OLE_040 OL15 15_Evo1_d50_WT3 CorticalOrganoids WT
## ... ... ... ... ... ... ...
## OLE_045 32 OLE_045 OL20 20_Evo3_d50_WTC_1-2 CorticalOrganoids WT
## OLE_046 33 OLE_046 OL21 21_Evo3_d50_A5_1 CorticalOrganoids AR
## OLE_047 34 OLE_047 OL22 22_Evo4_d50_WTC-A CorticalOrganoids WT
## OLE_048 35 OLE_048 OL23 23_Evo4_d50_A5-A CorticalOrganoids AR
## OLE_049 36 OLE_049 OL24 24_Evo4_d50_HET-A CorticalOrganoids HT
## Timepoint Batch SeqRun HRID lib.size
## <character> <character> <integer> <character> <numeric>
## OLE_010 d50 Round3 1 OLE_010 14450007
## OLE_011 d50 Round3 1 OLE_011 29379109
## OLE_016 d50 Round3 1 OLE_016 15451960
## OLE_017 d50 Round3 1 OLE_017 17477648
## OLE_040 d50 Evo1 2 OLE_040 18977971
## ... ... ... ... ... ...
## OLE_045 d50 Evo3 2 OLE_045 18918665
## OLE_046 d50 Evo3 2 OLE_046 12915310
## OLE_047 d50 Evo4 2 OLE_047 25159924
## OLE_048 d50 Evo4 2 OLE_048 17803005
## OLE_049 d50 Evo4 2 OLE_049 20109379CountMatrix <- assays(SE_Bio_d50CbO)$counts
SampleMeta <- DataFrame(colData(SE_Bio_d50CbO))
all(rownames(SampleMeta) == colnames(CountMatrix))
## [1] TRUEdds <- DESeqDataSetFromMatrix(countData=assays(SE_Bio_d50CbO)$counts, DataFrame(colData(SE_Bio_d50CbO)), design = ~Batch+Genotype)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in design formula are
## characters, converting to factors
mcols(dds) <- DataFrame(mcols(dds), DataFrame(rowData(SE_Bio_d50CbO)))
dds$Genotype <- factor(dds$Genotype, levels = c("WT", "AR", 'HT')) #no need to specify as column is already ordered, but safer
dds$Genotype <- relevel(dds$Genotype, ref = "WT")
dds$Genotype
## [1] WT HT WT HT WT AR AR HT HT WT AR WT AR HT
## Levels: WT AR HT
dds
## class: DESeqDataSet
## dim: 24708 14
## metadata(1): version
## assays(1): counts
## rownames(24708): TSPAN6 TNMD ... LNCDAT CDR1
## rowData names(9): Gene EnsGene ... Start End
## colnames(14): OLE_010 OLE_011 ... OLE_048 OLE_049
## colData names(11): SampleNumber SampleID_GU ... HRID lib.sizeZeroPlot <- CountMatrix %>%
mutate(row = row_number()) %>%
pivot_longer(cols = -row, names_to = "col", values_to = "value") %>%
filter(value == 0) %>%
group_by(col) %>%
summarise(zerocount = n()) %>%
ggplot(., aes(y=col, x=zerocount)) +
geom_col(col='black', fill='#76c8c8') +
coord_flip() +
geom_label(aes(label=zerocount)) +
labs(title=paste0('Number of genes with zero counts ', '(out of ', nrow(CountMatrix), ' genes)'),
y='', x='') +
theme_bw() +
theme(axis.text = element_text(colour = 'black', size=7),
axis.text.x = element_text(angle=45, hjust = 0.5, vjust=0.5),
plot.title = element_text(hjust = 0.5, size = 7))
ZeroPlotkeep <- rowSums(counts(dds)>=5) >= 2 #changed from 5 ##changed from 10 and 2 samples from 3
table(keep)
## keep
## FALSE TRUE
## 7035 17673
dds <- dds[keep,]
dds
## class: DESeqDataSet
## dim: 17673 14
## metadata(1): version
## assays(1): counts
## rownames(17673): TSPAN6 TNMD ... NPBWR1 PDCD6-AHRR
## rowData names(9): Gene EnsGene ... Start End
## colnames(14): OLE_010 OLE_011 ... OLE_048 OLE_049
## colData names(11): SampleNumber SampleID_GU ... HRID lib.sizeA dds object containing information about 17673 genes in 14 samples is tested for differential expression.
dds <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
sizeFactors(dds)
## OLE_010 OLE_011 OLE_016 OLE_017 OLE_040 OLE_041 OLE_042 OLE_043 OLE_044
## 0.7390996 1.4760943 0.7919811 0.8709701 0.9803307 1.1987549 1.0634332 1.1595120 1.0535215
## OLE_045 OLE_046 OLE_047 OLE_048 OLE_049
## 1.0511518 0.7119825 1.3296681 0.9142719 1.0500573select <- order(rowMeans(counts(dds,normalized=TRUE)),
decreasing=TRUE)[1:50]
ntd <- normTransform(dds)
df <- as.data.frame(colData(dds)[,c("Genotype")])
colnames(df) <- 'Genotype'
#df$Run <- c(rep('rep1', 3), rep('rep2', 3))
rownames(df) <- colnames(ntd)
pheatmap(assay(ntd)[select,], cluster_rows=FALSE, show_rownames=FALSE,
cluster_cols=TRUE, annotation_col=df, border_color = 'black')vsd <- vst(dds, blind=FALSE)
sampleDists <- dist(t(assay(vsd)))
sampleDistMatrix <- as.matrix(sampleDists)
#rownames(sampleDistMatrix) <- paste(vsd$condition, vsd$type, sep="-")
#colnames(sampleDistMatrix) <- NULL
colors <- colorRampPalette(rev(RColorBrewer::brewer.pal(9, "Blues")) )(255)
pheatmap(sampleDistMatrix,
clustering_distance_rows=sampleDists,
clustering_distance_cols=sampleDists,
col=colors, main = 'Samples Distances', name = 'vst')SF <- data.frame(Sample=names(sizeFactors(dds)), SizeF=sizeFactors(dds))
par(mfrow=c(2,2),cex.lab=0.7)
boxplot(log2(counts(dds)), col=ScaledCols, cex.axis=0.7,
las=1, xlab="log2(counts)", horizontal=TRUE, main="Raw counts")
boxplot(log2(counts(dds, normalized=TRUE)), col=ScaledCols, cex.axis=0.7,
las=1, xlab="log2(normalized counts)", horizontal=TRUE, main="Normalized counts")
plotDensity(log2(counts(dds)), col=ScaledCols,
xlab="log2(counts)", cex.lab=0.7, panel.first=grid())
plotDensity(log2(counts(dds, normalized=TRUE)), col=ScaledCols,
xlab="log2(normalized counts)", cex.lab=0.7, panel.first=grid()) plotDispEsts(dds)res_dds_ht <- results(dds, contrast=c("Genotype","HT","WT"), alpha=0.05, cooksCutoff=0.99)
summary(res_dds_ht)
##
## out of 17673 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 2887, 16%
## LFC < 0 (down) : 2714, 15%
## outliers [1] : 0, 0%
## low counts [2] : 343, 1.9%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
metadata(res_dds_ht)$filterThreshold
## 1.938776%
## 2.407767
metadata(res_dds_ht)$alpha
## [1] 0.05
plot(metadata(res_dds_ht)$filterNumRej,
type="b", ylab="number of rejections",
xlab="quantiles of filter", main='Heterozygous',
cex.lab=0.5, cex.axis=0.5, cex.main=0.5)
lines(metadata(res_dds_ht)$lo.fit, col="red")
abline(v=metadata(res_dds_ht)$filterTheta)head(res_dds_ht[order(res_dds_ht$padj),])
## log2 fold change (MLE): Genotype HT vs WT
## Wald test p-value: Genotype HT vs WT
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## ZNF117 2766.734 -7.54332 0.1549884 -48.6703 0.00000e+00 0.00000e+00
## ERV3-1 1492.936 -7.53693 0.1759269 -42.8412 0.00000e+00 0.00000e+00
## ZNF562 382.685 -4.36925 0.1900274 -22.9927 5.51293e-117 3.18464e-113
## INPP5F 2050.753 1.52067 0.0686312 22.1572 8.89688e-109 3.85457e-105
## ZNF273 235.836 -6.33985 0.3182608 -19.9203 2.71381e-88 9.40606e-85
## ZNF471 418.291 -4.90022 0.2575796 -19.0241 1.07736e-80 3.11177e-77Genes modulated considering a FDR threshold of 0.1: 6638
Genes modulated considering a FDR threshold of 0.05: 5601
Genes modulated considering a FDR threshold of 0.05 and FC threshold of 1.5: 2202
Genes modulated considering a FDR threshold of 0.05 and FC threshold of 2: 1136
Since I am using the constrast option to retrieve the results, I cannot rely on apleglm default algorithm for logFC shrinkage. Since at the moment I am not using the lfcThreshold option, I decide for the ashr algorithm.
resAshr_ht <- lfcShrink(dds, contrast=c("Genotype","HT","WT"), res=res_dds_ht, type="ashr")
## using 'ashr' for LFC shrinkage. If used in published research, please cite:
## Stephens, M. (2016) False discovery rates: a new deal. Biostatistics, 18:2.
## https://doi.org/10.1093/biostatistics/kxw041
summary(resAshr_ht)
##
## out of 17673 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 2887, 16%
## LFC < 0 (down) : 2714, 15%
## outliers [1] : 0, 0%
## low counts [2] : 343, 1.9%
## (mean count < 2)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results#par(mfrow=c(1,2), mar=c(4,4,2,1))
xlim <- c(1,1e5); ylim <- c(-6,6)
DESeq2::plotMA(res_dds_ht, xlim=xlim, ylim=ylim, main="no LFC shrink")DESeq2::plotMA(resAshr_ht, xlim=xlim, ylim=ylim, main="LFC shrink with ashr algorithm")head(resAshr_ht[order(resAshr_ht$padj),])
## log2 fold change (MMSE): Genotype HT vs WT
## Wald test p-value: Genotype HT vs WT
## DataFrame with 6 rows and 5 columns
## baseMean log2FoldChange lfcSE pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric>
## ZNF117 2766.734 -7.53678 0.1549485 0.00000e+00 0.00000e+00
## ERV3-1 1492.936 -7.52850 0.1758691 0.00000e+00 0.00000e+00
## ZNF562 382.685 -4.35450 0.1904841 5.51293e-117 3.18464e-113
## INPP5F 2050.753 1.50850 0.0685866 8.89688e-109 3.85457e-105
## ZNF273 235.836 -6.31315 0.3184744 2.71381e-88 9.40606e-85
## ZNF471 418.291 -4.87726 0.2584989 1.07736e-80 3.11177e-77Genes modulated considering a FDR threshold of 0.1: 6638
Genes modulated considering a FDR threshold of 0.05: 5601
Genes modulated considering a FDR threshold of 0.05 and FC threshold of 1.5: 1744
Genes modulated considering a FDR threshold of 0.05 and FC threshold of 2: 708
Log Fold change shrinkage doesn’t change the results significantly, thus I decided to keep the standard DESeq2 workflow for testing differential gene expression.
deseqDEGsHT <- dplyr::filter(data.frame(res_dds_ht), padj < padj_threshold, abs(log2FoldChange) > log2(log2fc_threshold))
deseqDEGsHTashr <- dplyr::filter(data.frame(resAshr_ht), padj < padj_threshold, abs(log2FoldChange) > log2(log2fc_threshold))SE_deseq2 <- as(dds, "RangedSummarizedExperiment")
assays(SE_deseq2)$vst <- assay(vst(dds, blind=TRUE))FdrCeil = 1e-10
logFcCeil = 7.5#rename(as.data.frame(res_dds_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = 0.05, LabellingCutoff = 0.01, FCCutoff = 2, main = 'HT vs WT')
dplyr::rename(as.data.frame(res_dds_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = padj_threshold, LabellingCutoff = 0.01, FCCutoff = log2fc_threshold, main = 'HT vs WT (day50 CO)') + labs(y='Log FoldChange') + xlim(0, -log10(FdrCeil)) + ylim(-logFcCeil, logFcCeil)#rename(as.data.frame(res_dds_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = 0.05, LabellingCutoff = 0.01, FCCutoff = 2, main = 'HT vs WT')
dplyr::rename(as.data.frame(resAshr_ht), logFC = log2FoldChange, FDR = padj) %>% VolcanoTiltedNodash(. , AdjustedCutoff = padj_threshold, LabellingCutoff = 0.01, FCCutoff = log2fc_threshold, main = 'HT vs WT (day50 CO)') + labs(y='Log FoldChange') + xlim(0, -log10(FdrCeil)) + ylim(-logFcCeil, logFcCeil)DEAList <- list()
DEAList <- list(dds = dds, #same for both
HT = list(res = res_dds_ht,
DEGs = deseqDEGsHT,
resAshr =resAshr_ht,
DEGsAshr = deseqDEGsHTashr))# RDS
saveRDS(DEAList, paste0(SavingFolder, '/day50CbO.', 'DEAList_HT.rds'))
saveRDS(SE_deseq2, paste0(SavingFolder, '/day50CbO.', 'SE_deseq2_HT.rds'))
saveRDS(res_dds_ht, paste0(SavingFolder, '/day50CbO.', 'deseqHTvsWT.rds'))
SessionInfo <- sessionInfo()
Date <- date()
save.image(paste0(SavingFolder, '/day50CbO.', 'DEAnalysisWorkspace_HT.RData'))